vroniplagwikiaorg_de-20200216-history
Benutzer:Kybot/getbot.py
# Description: Loads arbitrary pages from the wiki. Output format is 'XML'
# (mostly, at least ;). Example output: dv_export.xml
#!/usr/bin/python
# -*- coding: utf-8 -*-
#vpget.py -titleregex:"^Dv/Fragment.[IVX0-9]+"
#vpget.py -titleregex:\w
#vpget.py -nss:0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,110,112,113,500,501,902 -titleregex:\w
#vpget.py -subns:Dv
"""
The following parameters are supported:

&params;

-nss              Work on the specified namespaces. Separate multiple
                  namespace numbers with commas. Example "-nss:0,2,4"

-subns            Work on the specified subnamespace. Example "-subns:Dv"

All other parameters will be regarded as part of the title of a single page,
and the bot will only work on that single page.
"""
__version__ = '$Id$'

import wikipedia as pywikibot
import pagegenerators
import sys
import os
import re
import codecs
from category import *
from xml.sax.saxutils import escape
from xml.sax.saxutils import quoteattr

# This is required for the text that is shown when you run this script
# with the parameter -help.
docuReplacements = {
    '&params;': pagegenerators.parameterHelp
}

# Directory (relative to the working directory) that receives the XML files.
_OUTPUT_DIR = '_output'

# Text type of the running interpreter (``unicode`` on Python 2).
_TEXT = type(u'')

# An ampersand that does not start a numeric character reference ("&#...").
_BARE_AMPERSAND = re.compile(u'&(?!#)')


class Config(object):
    """Mutable settings shared between main(), the bot and the allpages patch."""

    def __init__(self):
        # Namespace number currently being exported (set by main()).
        self.namespace = ''
        # Display name of that namespace; also used in the output file name.
        self.namespaceName = ''
        # Value of the -subns argument, if one was given.
        self.subnamespace = ''


config = Config()

_unpatched = pywikibot.Site.allpages


def patch_allpages(self, start='!', namespace=None, includeredirects=True,
                   throttle=True):
    """Replacement for Site.allpages that forces the configured namespace.

    The ``namespace`` argument is accepted for signature compatibility but
    deliberately ignored: ``config.namespace`` is passed on instead, so every
    allpages-based generator is restricted to the namespace main() selected.
    """
    return _unpatched(self, start=start, namespace=config.namespace,
                      includeredirects=includeredirects, throttle=throttle)


pywikibot.Site.allpages = patch_allpages


def _attr(value):
    """Return ``value`` as a quoted XML attribute value (None becomes "")."""
    if value is None:
        value = u''
    elif not isinstance(value, (_TEXT, str)):
        value = _TEXT(value)
    return quoteattr(value)


def _escape_wikitext(text):
    """Escape wiki text for use as XML character data.

    Numeric character references ("&#...;") are left untouched; every other
    ampersand and all angle brackets are escaped.  A negative look-ahead is
    used instead of a textual placeholder so that wiki text which happens to
    contain the placeholder (e.g. nested list markup) is not corrupted.
    """
    # NOTE(review): the first replacements of the original chain were
    # unreadable in the copy this was recovered from; mapping "&nbsp;" to a
    # plain space is a reconstruction - confirm against the original script.
    text = text.replace(u'&nbsp;', u' ')
    text = _BARE_AMPERSAND.sub(u'&amp;', text)
    return text.replace(u'<', u'&lt;').replace(u'>', u'&gt;')


class GetBot(object):
    """Bot that dumps every page of a generator into one XML file."""

    def __init__(self, generator):
        """
        Constructor. Parameters:
            * generator - The page generator that determines on which pages
                          to work on.
        """
        self.generator = generator

    def run(self):
        """Write all pages of the generator to _output/ns<namespace>.xml."""
        pywikibot.output(u"\n\n>>> \03{lightpurple}Writing file %s.xml\03{default} <<<" % config.namespaceName)
        if not os.path.isdir(_OUTPUT_DIR):
            os.makedirs(_OUTPUT_DIR)
        path = os.path.join(_OUTPUT_DIR, 'ns' + config.namespaceName + '.xml')
        ofile = codecs.open(path, 'w', 'utf-8')
        try:
            # NOTE(review): the root element name was stripped from the copy
            # this was recovered from; "pages" is a reconstruction - confirm
            # against an existing export such as dv_export.xml.
            ofile.write(u'<pages>\n')
            for page in self.generator:
                self.treat(page, ofile)
            ofile.write(u'</pages>\n')
        finally:
            # Close the file even if a page fails half-way through.
            ofile.close()

    def treat(self, page, ofile):
        """
        Loads the given page and appends its XML representation to ofile.

        Pages that do not exist or that are redirects are reported and
        skipped.
        """
        try:
            exportText = self._render(page)
        except pywikibot.NoPage:
            pywikibot.output(u"Page %s does not exist; skipping."
                             % page.aslink())
            return
        except pywikibot.IsRedirectPage:
            pywikibot.output(u"Page %s is a redirect; skipping."
                             % page.aslink())
            return
        # ofile is a codecs writer that encodes on its own; handing it
        # already-encoded bytes would fail on non-ASCII text.
        ofile.write(exportText)

    def _render(self, page):
        """Build the complete XML element for one page."""
        # NOTE(review): all element names below were stripped from the copy
        # this was recovered from and are reconstructions; the attribute
        # names of the page element are original.
        # Load the page
        text = page.get()

        # Categories (titles are escaped, they may contain "&")
        categories = u'<categories>' + u''.join(
            u'<category>' + escape(category.titleWithoutNamespace())
            + u'</category>'
            for category in page.categories()) + u'</categories>'

        # Wiki Text
        normalizedText = u'<text>' + _escape_wikitext(text) + u'</text>'

        # Wiki Attributes
        normalizedAttributes = (
            u'title=' + _attr(page.title())
            + u' lastEditor=' + _attr(page.userName())
            + u' editTime=' + _attr(page.editTime())
            + u' latestRevision=' + _attr(page.latestRevision()))

        # Result
        return (u'<page ' + normalizedAttributes + u'>' + normalizedText
                + categories + self._history_xml(page) + u'</page>\n')

    def _history_xml(self, page):
        """Build the version history element, in reversed history order."""
        # NOTE(review): only "data[4] is the revision size" is visible in
        # the original; the remaining tuple layout (revision id, timestamp,
        # user, comment) and the attribute names are assumptions - confirm
        # against Page.getVersionHistory().
        entries = []
        lastSize = 0
        for data in reversed(page.getVersionHistory()):
            size = data[4] or 0
            entries.append(
                u'<version revision=' + _attr(data[0])
                + u' time=' + _attr(data[1])
                + u' user=' + _attr(data[2])
                + u' comment=' + _attr(data[3])
                + u' size=' + _attr(size)
                + u' delta=' + _attr(size - lastSize) + u'/>')
            lastSize = size
        return u'<versions>' + u''.join(entries) + u'</versions>'


def main():
    """Export every requested namespace (default: 0) to its own XML file."""
    # Global options are consumed by handleArgs(); what remains is ours.
    args = pywikibot.handleArgs()

    # The default namespace
    nss = [0]
    # Handle multiple namespaces
    for arg in args:
        if arg.startswith("-nss:"):
            nss = [int(part) for part in arg[len('-nss:'):].split(",")
                   if part.strip()] or [0]

    # Iterate namespaces
    for ns in nss:
        site = pywikibot.getSite()
        nsName = site.family.namespaces[ns]['_default']
        if nsName is None:
            nsName = 'Hauptnamensraum'
        # Must be set before any generator runs: patch_allpages reads it.
        config.namespace = ns
        config.namespaceName = nsName
        pywikibot.output(u"\n\n>>> \03{lightpurple}Getting pages from namespace %s\03{default} <<<" % nsName)

        # This factory is responsible for processing command line arguments
        # that are also used by other scripts and that determine on which
        # pages to work on.
        genFactory = pagegenerators.GeneratorFactory()
        # The generator gives the pages that should be worked upon.
        gen = None
        # This temporary array is used to read the page title if one single
        # page to work on is specified by the arguments.
        pageTitleParts = []

        # Parse command line arguments
        for arg in args:
            if arg.startswith("-nss:"):
                continue
            if arg.startswith("-subns:"):
                config.subnamespace = arg[len('-subns:'):]
                # -subns:X is shorthand for all fragment pages below X.
                arg = ('-titleregex:^' + config.subnamespace
                       + '/Fragment.[IVX0-9]+')
            # check if a standard argument like
            # -start:XYZ or -ref:Asdf was given.
            if not genFactory.handleArg(arg):
                pageTitleParts.append(arg)

        if pageTitleParts:
            # We will only work on a single page.
            pageTitle = ' '.join(pageTitleParts)
            page = pywikibot.Page(pywikibot.getSite(), pageTitle)
            gen = iter([page])

        if not gen:
            gen = genFactory.getCombinedGenerator()
        if gen:
            # The preloading generator is responsible for downloading
            # multiple pages from the wiki simultaneously.
            gen = pagegenerators.PreloadingGenerator(gen)
            bot = GetBot(gen)
            bot.run()
        else:
            pywikibot.showHelp()


if __name__ == "__main__":
    try:
        main()
    finally:
        pywikibot.stopme()